This script will include all steps in the “main pipeline” part of my thesis project. This includes differential analysis of the reference airway current vs never smoker dataset (A1, GSE63127), differential expression analysis of the TCGA-LUAD lung adenocarcinoma expression and methylation datasets, and the reference “persistent” airway current vs former vs never smoker dataset (A2, GSE7895). This includes all normalization, quality control, and filtering steps.

Notes: - 2024/12/04: For this version of the pipeline, I am using limma for A1, DESeq2 for T-E, and limma for T-M, which uses the cbioportal-derived data. I have gathered that the Wilcoxon signed-rank test is better for smaller sample sizes (I can back this up with sources later). I am also experimenting with applying stringent FDR cutoffs and applying log2FC cutoffs based on how they affect the histograms of expression distribution (keeping the more normally distributed bits). Down the line I may want to use the probewise processing of T-M instead of the cbioportal version, as they do not provide a good explanation of their preprocessing.

Loading libraries

library(EnhancedVolcano, verbose = FALSE)
## Loading required package: ggplot2
## Loading required package: ggrepel
## Warning: package 'ggrepel' was built under R version 4.3.3
library(GEOquery, verbose = FALSE)
## Loading required package: Biobase
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Setting options('download.file.method.GEOquery'='auto')
## Setting options('GEOquery.inmemory.gpl'=FALSE)
library(limma, verbose = FALSE)
## 
## Attaching package: 'limma'
## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA
library(umap, verbose = FALSE)
library(dplyr, verbose = FALSE)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:Biobase':
## 
##     combine
## The following objects are masked from 'package:BiocGenerics':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

1. Differential expression analysis of reference airway current vs never smoker dataset (A1, GSE63127)

1.1 Loading dataset

# Version info: R 4.2.2, Biobase 2.58.0, GEOquery 2.66.0, limma 3.54.0
################################################################
#   Differential expression analysis with limma


# load series and platform data from GEO (date: 2024/10/15)
gset <- getGEO("GSE63127", GSEMatrix =TRUE, AnnotGPL=TRUE)
## Found 1 file(s)
## GSE63127_series_matrix.txt.gz
if (length(gset) > 1) idx <- grep("GPL570", attr(gset, "names")) else idx <- 1
gset <- gset[[idx]]

# make proper column names to match toptable 
fvarLabels(gset) <- make.names(fvarLabels(gset))

# group membership for all samples
gsms <- paste0("X00100011111X00000000000X00000X000000000000X000X00",
               "XX00X0XXXXXXXXX1111111111111111111111X111X11111111",
               "XXXX1XXXXXXXXXXXXXXXXXXXXXXXX001000000010100111111",
               "01110111110011111001110011101011111001110101100011",
               "111111111111111111111111111111")
sml <- strsplit(gsms, split="")[[1]]

# filter out excluded samples (marked as "X")
sel <- which(sml != "X")
sml <- sml[sel]
gset <- gset[ ,sel]

gset <- gset[complete.cases(exprs(gset)), ] # skip missing values
length(sml) # 182 samples
## [1] 182

1.2 Quality control checks and normalization

## Make histograms and boxplots to check if the data is log-transformed and needs quantile normalization ##
hist(as.matrix(exprs(gset))) # skewed left, needs log2 transform

boxplot(exprs(gset)) # scary-looking

max(exprs(gset))
## [1] 136808
min(exprs(gset))
## [1] 0.0657913
# Should do log2 normalization

## log2 normalization ##
exprs(gset) <- log2(exprs(gset)+1)

# quantile normalization: no longer doing this for now to better capture variation
##exprs(gset) <- normalizeBetweenArrays(exprs(gset)) 


hist(as.matrix(exprs(gset))) # much better

boxplot(exprs(gset)) # Looks reasonable

min(exprs(gset)) 
## [1] 0.09192496
max(exprs(gset)) 
## [1] 17.0618

1.3 Checking and correcting batch effect / sources of variation

1.3.1: Download and clean up the phenotypic information table from the dataset

# assign samples to groups and set up design matrix
gs <- factor(sml)
groups <- make.names(c("non-smoker","smoker"))
levels(gs) <- groups
gset$group <- gs

phenotypic_data <- pData(gset)  # Extract phenotypic data
head(phenotypic_data, n = 3)
##                                   title geo_accession                status
## GSM190150 Small airways, non-smoker 029     GSM190150 Public on Dec 16 2008
## GSM190153 Small airways, non-smoker 036     GSM190153 Public on Jun 17 2008
## GSM254157     small airways, smoker 112     GSM254157 Public on Jun 17 2008
##           submission_date last_update_date type channel_count
## GSM190150     May 17 2007      Aug 28 2018  RNA             1
## GSM190153     May 17 2007      Aug 28 2018  RNA             1
## GSM254157     Jan 03 2008      Aug 28 2018  RNA             1
##                                                         source_name_ch1
## GSM190150 airway epithelial cells obtained by bronchoscopy and brushing
## GSM190153 airway epithelial cells obtained by bronchoscopy and brushing
## GSM254157 airway epithelial cells obtained by bronchoscopy and brushing
##           organism_ch1 characteristics_ch1 characteristics_ch1.1
## GSM190150 Homo sapiens             age: 34                sex: M
## GSM190153 Homo sapiens             age: 45                sex: F
## GSM254157 Homo sapiens             age: 45                sex: M
##            characteristics_ch1.2                 characteristics_ch1.3
## GSM190150    ethnic group: black            smoking status: non-smoker
## GSM190153 ethnic group: hispanic            smoking status: non-smoker
## GSM254157    ethnic group: white smoking status: smoker, 23 pack-years
##           molecule_ch1
## GSM190150    total RNA
## GSM190153    total RNA
## GSM254157    total RNA
##                                                                                                      extract_protocol_ch1
## GSM190150 Trizol extraction and RNAeasy clean-up of total RNA was performed according to the manufacturer's instructions.
## GSM190153 Trizol extraction and RNAeasy clean-up of total RNA was performed according to the manufacturer's instructions.
## GSM254157 Trizol extraction and RNAeasy clean-up of total RNA was performed according to the manufacturer's instructions.
##           label_ch1
## GSM190150    biotin
## GSM190153    biotin
## GSM254157    biotin
##                                                                                                                                                                  label_protocol_ch1
## GSM190150   Biotinylated cRNA were prepared according to the standard Affymetrix protocol from 3 microg total RNA (Expression Analysis Technical Manual, 701022 Rev.2, Affymetrix).
## GSM190153   Biotinylated cRNA were prepared according to the standard Affymetrix protocol from 3 microg total RNA (Expression Analysis Technical Manual, 701022 Rev.2, Affymetrix).
## GSM254157 Biotinylated cRNA were prepared according to the standard Affymetrix protocol from 1-2 microg total RNA (Expression Analysis Technical Manual, 701022 Rev.2, Affymetrix).
##           taxid_ch1
## GSM190150      9606
## GSM190153      9606
## GSM254157      9606
##                                                                                                                                                                                  hyb_protocol
## GSM190150 Following fragmentation, 15 microg of cRNA were hybridized for 16 hr at 45C on GeneChip HG-U133 Plus 2.0. GeneChips were washed and stained in the Affymetrix Fluidics Station 450.
## GSM190153 Following fragmentation, 15 microg of cRNA were hybridized for 16 hr at 45C on GeneChip HG-U133 Plus 2.0. GeneChips were washed and stained in the Affymetrix Fluidics Station 450.
## GSM254157 Following fragmentation, 10 microg of cRNA were hybridized for 16 hr at 45C on GeneChip HG-U133 Plus 2.0. GeneChips were washed and stained in the Affymetrix Fluidics Station 450.
##                                                        scan_protocol
## GSM190150 GeneChips were scanned using the GeneChip Scanner 3000 7G.
## GSM190153 GeneChips were scanned using the GeneChip Scanner 3000 7G.
## GSM254157 GeneChips were scanned using the GeneChip Scanner 3000 7G.
##                         description
## GSM190150 small airways, non-smoker
## GSM190153 small airways, non-smoker
## GSM254157 small airways, smoker 112
##                                                                                                                                                     data_processing
## GSM190150 The data were analyzed with Microarray Suite version 5.0 (MAS 5.0) using Affymetrix default analysis settings and global scaling as normalization method.
## GSM190153 The data were analyzed with Microarray Suite version 5.0 (MAS 5.0) using Affymetrix default analysis settings and global scaling as normalization method.
## GSM254157 The data were analyzed with Microarray Suite version 5.0 (MAS 5.0) using Affymetrix default analysis settings and global scaling as normalization method.
##           platform_id           contact_name           contact_email
## GSM190150      GPL570 Yael,,Strulovici-Barel yas2003@med.cornell.edu
## GSM190153      GPL570 Yael,,Strulovici-Barel yas2003@med.cornell.edu
## GSM254157      GPL570 Yael,,Strulovici-Barel yas2003@med.cornell.edu
##           contact_laboratory             contact_department
## GSM190150            Crystal Department of Genetic Medicine
## GSM190153            Crystal Department of Genetic Medicine
## GSM254157            Crystal Department of Genetic Medicine
##                       contact_institute  contact_address contact_city
## GSM190150 Weill Cornell Medical College 1300 York Avenue     New York
## GSM190153 Weill Cornell Medical College 1300 York Avenue     New York
## GSM254157 Weill Cornell Medical College 1300 York Avenue     New York
##           contact_state contact_zip/postal_code contact_country
## GSM190150            NY                   10021             USA
## GSM190153            NY                   10021             USA
## GSM254157            NY                   10021             USA
##                                                                          supplementary_file
## GSM190150 ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM190nnn/GSM190150/suppl/GSM190150.CEL.gz
## GSM190153 ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM190nnn/GSM190153/suppl/GSM190153.CEL.gz
## GSM254157 ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM254nnn/GSM254157/suppl/GSM254157.CEL.gz
##                                                                        supplementary_file.1
## GSM190150 ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM190nnn/GSM190150/suppl/GSM190150.CHP.gz
## GSM190153 ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM190nnn/GSM190153/suppl/GSM190153.CHP.gz
## GSM254157 ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM254nnn/GSM254157/suppl/GSM254157.CHP.gz
##           data_row_count                 relation               relation.1
## GSM190150          54675 Reanalyzed by: GSE119087                         
## GSM190153          54675  Reanalyzed by: GSE60486 Reanalyzed by: GSE119087
## GSM254157          54675  Reanalyzed by: GSE60486 Reanalyzed by: GSE119087
##           age:ch1 cilia length:ch1 copd status:ch1
## GSM190150      34             <NA>            <NA>
## GSM190153      45             <NA>            <NA>
## GSM254157      45             <NA>            <NA>
##           department of genetic medicine id:ch1 ethnic group:ch1 ethnicity:ch1
## GSM190150                                  <NA>            black          <NA>
## GSM190153                                  <NA>         hispanic          <NA>
## GSM254157                                  <NA>            white          <NA>
##           serum 25-oh-d:ch1 sex:ch1    smoking status:ch1      group
## GSM190150              <NA>       M            non-smoker non.smoker
## GSM190153              <NA>       F            non-smoker non.smoker
## GSM254157              <NA>       M smoker, 23 pack-years     smoker
# The phenotypic data is terrible.
# This is filtered down to the samples that were included.
# I will first try to clean up the phenotypic data.

# So I think the features I want to keep will be:
# Dates of submission/updates etc, sex, ethnicity, smoking status
# Keep the columns that might contain data of interest, which will need to be cleaned up.

# List of column names I want to keep and clean up into usable labels
columns_to_find <- c("geo_accession","status","submission_date","last_update_date","characteristics_ch1","characteristics_ch1.1","characteristics_ch1.2","characteristics_ch1.3","age:ch1","cilia length:ch1","ethnic group:ch1","ethnicity:ch1","serum 25-oh-d:ch1","sex:ch1","smoking status:ch1","group")

# Get the column indexes
indexes <- sapply(columns_to_find, function(col_name) which(names(phenotypic_data) == col_name))
indexes <- unlist(indexes)

phenotypic_data <- phenotypic_data[,c(indexes)]

# Now I need to parse out sex, ethnicity, smoking status, and age, vitamin D, pack years.

#Rename "group" as "smoking status"
names(phenotypic_data)[16] <- "smoking_status"

## Grabbing ethnicity values from the columns ##
# Initialize a new column "ethnicity" with NA values
phenotypic_data$ethnicity <- NA

# Function to find 'eth' in a row and return the corresponding value
find_ethnicity <- function(row) {
  eth_column <- which(grepl('eth', row))
  if (length(eth_column) > 0) {
    return(row[eth_column])
  } else {
    return(NA)
  }
}
# Apply the function row-wise to populate the "ethnicity" column
phenotypic_data$ethnicity <- apply(phenotypic_data, 1, find_ethnicity)

## Grabbing sex values from the columns ##
# Initialize a new column "sex" with NA values
phenotypic_data$sex <- NA

# Function to find 'sex' in a row and return the corresponding value
find_sex <- function(row) {
  sex_column <- which(grepl('sex', row))
  if (length(sex_column) > 0) {
    return(row[sex_column])
  } else {
    return(NA)
  }
}
# Apply the function row-wise to populate the "sex" column
phenotypic_data$sex <- apply(phenotypic_data, 1, find_sex)


## Grabbing pack_years values from the columns ##
# Initialize a new column "pack_years" with NA values
phenotypic_data$pack_years <- NA

# Function to find 'pack_years' in a row and return the corresponding value, but just the first instance
find_pack_years <- function(row) {
  pack_years_column <- which(grepl('pack', row))
  if (length(pack_years_column) > 0) {
    return(row[pack_years_column[1]])
  } else {
    return(NA)
  }
}
# Apply the function row-wise to populate the "pack_years" column
phenotypic_data$pack_years <- apply(phenotypic_data, 1, find_pack_years)
#unlist the column
phenotypic_data$pack_years <- unlist(phenotypic_data$pack_years )



## Grabbing age values from the columns ##
# Initialize a new column "age" with NA values
phenotypic_data$age <- NA

# Function to find 'age' in a row and return the corresponding value
find_age <- function(row) {
  age_column <- which(grepl('age', row))
  if (length(age_column) > 0) {
    return(row[age_column])
  } else {
    return(NA)
  }
}
# Apply the function row-wise to populate the "age" column
phenotypic_data$age <- apply(phenotypic_data, 1, find_age)


## Grabbing vitamin_d values from the columns ##
# Initialize a new column "vitamin_d" with NA values
phenotypic_data$vitamin_d <- NA

# Function to find 'vitamin_d' in a row and return the corresponding value, first instance
find_vitamin_d <- function(row) {
  vitamin_d_column <- which(grepl('vitamin', row))
  if (length(vitamin_d_column) > 0) {
    return(row[vitamin_d_column[1]])
  } else {
    return(NA)
  }
}
# Apply the function row-wise to populate the "vitamin_d" column
phenotypic_data$vitamin_d <- apply(phenotypic_data, 1, find_vitamin_d)

## Grabbing vitamin_d values from the columns ##
# Initialize a new column "vitamin_d" with NA values
phenotypic_data$vitamin_d <- NA

# Function to find 'vitamin_d' in a row and return the corresponding value, first instance
find_vitamin_d <- function(row) {
  vitamin_d_column <- which(grepl('vitamin', row))
  if (length(vitamin_d_column) > 0) {
    return(row[vitamin_d_column[1]])
  } else {
    return(NA)
  }
}
# Apply the function row-wise to populate the "vitamin_d" column
phenotypic_data$vitamin_d <- apply(phenotypic_data, 1, find_vitamin_d)

## Grabbing cilia values from the columns ##
# Initialize a new column "cilia_length" with NA values
phenotypic_data$cilia_length <- NA

# Function to find 'cilia' in a row and return the corresponding value, first instance
find_cilia <- function(row) {
  cilia_column <- which(grepl('cilia', row))
  if (length(cilia_column) > 0) {
    return(row[cilia_column[1]])
  } else {
    return(NA)
  }
}
# Apply the function row-wise to populate the "cilia" column
phenotypic_data$cilia_length <- apply(phenotypic_data, 1, find_cilia)



## Now cut out the messy columns
phenotypic_data <- phenotypic_data[,-c(5:15)]

## Remove unnecessary prefix info
phenotypic_data$ethnicity <- gsub(".*: ", "", phenotypic_data$ethnicity )
phenotypic_data$age <- gsub(".*: ", "", phenotypic_data$age)
phenotypic_data$sex <- gsub(".*: ", "", phenotypic_data$sex)
phenotypic_data$vitamin_d <- gsub(".*: ", "", phenotypic_data$vitamin_d)
phenotypic_data$cilia_length <- gsub(".*: ", "", phenotypic_data$cilia_length)

phenotypic_data$pack_years<- gsub(".*, ", "", phenotypic_data$pack_years)
phenotypic_data$pack_years<- gsub("pack-years", "", phenotypic_data$pack_years)


# Reformat the submission dates to be sortable

phenotypic_data <- phenotypic_data %>%
  mutate(submission_date = ifelse(submission_date == "Dec 20 2012", "2012-12-20", submission_date)) %>%
  mutate(submission_date = ifelse(submission_date == "Jan 03 2008", "2008-01-08", submission_date)) %>%
  mutate(submission_date = ifelse(submission_date == "Jan 31 2013", "2013-01-31", submission_date)) %>%
  mutate(submission_date = ifelse(submission_date == "Jun 03 2010", "2010-06-03", submission_date)) %>%
  mutate(submission_date = ifelse(submission_date == "Jun 13 2008", "2008-06-13", submission_date)) %>%
  mutate(submission_date = ifelse(submission_date == "May 17 2007", "2007-05-17", submission_date)) %>%
  mutate(submission_date = ifelse(submission_date == "Nov 08 2013", "2013-11-08", submission_date)) %>%
  mutate(submission_date = ifelse(submission_date == "Nov 10 2014", "2014-11-10", submission_date))

head(phenotypic_data, n = 3)
##           geo_accession                status submission_date last_update_date
## GSM190150     GSM190150 Public on Dec 16 2008      2007-05-17      Aug 28 2018
## GSM190153     GSM190153 Public on Jun 17 2008      2007-05-17      Aug 28 2018
## GSM254157     GSM254157 Public on Jun 17 2008      2008-01-08      Aug 28 2018
##           smoking_status ethnicity sex pack_years age vitamin_d cilia_length
## GSM190150     non.smoker     black   M       <NA>  34      <NA>         <NA>
## GSM190153     non.smoker  hispanic   F       <NA>  45      <NA>         <NA>
## GSM254157         smoker     white   M        23   45      <NA>         <NA>

1.3.2: Plot PCA and use phenotypic information to look for sources of batch effect/variation, and correct for these with ComBat

# assign samples to groups and set up design matrix
gs <- factor(sml)
groups <- make.names(c("non-smoker","smoker"))
levels(gs) <- groups
gset$group <- gs


## Plot PCA 1 ##
colz <- as.numeric(as.factor(gs)) # Get color values from group

plotMDS(exprs(gset),
        gene.selection = "common",
        main = "PCA for CS vs NS GSE63127",
        col = colz,
        pch = 1
)
legend("bottom", legend = levels(as.factor(gs)), 
       fill = unique(colz), 
       title = "Smoking status")

## We have 4 definite clusters that are not based on smoking status. 
## As such, it is a good idea to check the table of sample phenotypic information to look for sources of variation between samples.


pointz <- as.numeric(as.factor(phenotypic_data$submission_date<= "2010-06-03")) # Get point shape values from date of submission: split into 2010 and earlier, post-2010]

## Plot PCA with date information##
plotMDS(exprs(gset),
        gene.selection = "common",
        main = "PCA for CS vs NS GSE63127",
        col = colz, # Colors smokers red and nonsmokers black
        pch = pointz
        #labels = gset$group
)
legend("bottom", 
       legend = c("Smoker", "Non-mokers", 
                  "2010 and Prior", "Post-2010"), 
       col = c("red", "black", "black", "black"), # Colors: only for smoking status
       pch = c(15, 15, 2, 1),                   # Shapes: 2 = triangle, 1 = circle
       pt.cex = c(1, 1, 1, 1),             # Adjust size for better visibility
       text.col = "black",                     # Text color
#       bty = "n"
       )                              # Box type: 'n' removes border

# Clearly the source of batch effect in PC1 is submission date post-2010.
# Note: I found that the split was at 2010 by doing a bit of playing around with other clustering methods, not shown here.

# First batch correction (submission date)
library(sva)
## Loading required package: mgcv
## Loading required package: nlme
## Warning: package 'nlme' was built under R version 4.3.3
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse
## This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.
## Loading required package: genefilter
## Loading required package: BiocParallel
library(limma)

# Making a batch vector
submission_post_2010_batch <- ifelse(phenotypic_data$submission_date < as.Date("2012-01-01"), 1, 2)

# Adjust the expression matrix for submission date batch effect
exprs_matrix_combat <- ComBat(dat=exprs(gset), batch=submission_post_2010_batch, mod=NULL, par.prior=TRUE, prior.plots=FALSE)
## Found2batches
## Adjusting for0covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
## Plot PCA for expression values after first batch correction ##
date_corrected_PCA <- plotMDS(exprs_matrix_combat,
        gene.selection = "common",
        main = "PCA for CS vs NS GSE63127, corrected for submission date",
        col = colz, # Colors smokers red and nonsmokers black
        pch = pointz

)
legend("bottom",
       legend = c("Smoker", "Non-smoker"),
       col = c("red", "black"),
       pch = c(15, 15)
       #title = "Smoking status"
       )

## Some evidence that second source of variation could be due to sex (but only 11/182 samples have sex labels):
plotMDS(exprs_matrix_combat,
        gene.selection = "common",
        main = "PCA for CS vs NS GSE63127, corrected for submission date",
        col = colz, # Colors smokers red and nonsmokers black
        #pch = pointz2 # Using separate shapes for all submission dates
        labels = phenotypic_data$sex
)
legend("bottom",
       legend = c("M = Male", "F = Female", "Smoker", "Non-smoker"),
       col = c("black", "black", "red", "black"),
       pch = c(NA, NA, 15, 15)
       #title = "Smoking status"
       )

## Samples are divided by sex, but 11/182 samples is not enough to draw a conclusion here.

## Second correction for unknown source of variation using ComBat: ##

# Assign batch labels based on the first dimension from MDS (equivalent to PC1), since the dividing line for the batches lies at 0
unknown_batch_labels <- ifelse(date_corrected_PCA$x < 0, 1, 2)

# Do a second batch correction
exprs_matrix_combat_2 <- ComBat(dat=exprs_matrix_combat, batch=unknown_batch_labels, mod=NULL, par.prior=TRUE, prior.plots=FALSE)
## Found2batches
## Adjusting for0covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
# View PCA plot
plotMDS(exprs_matrix_combat_2,
        gene.selection = "common",
        main = "PCA for CS vs NS GSE63127 after 2 ComBat corrections",
        col = colz, # Colors smokers red and nonsmokers black
        pch = pointz
        #labels = gset$group
)
legend("topleft", 
       legend = c("Smoker", "Non-smoker", 
                  "2010 and Prior", "Post-2010"), 
       col = c("red", "black", "black", "black"), # Colors: only for smoking status
       pch = c(15, 15, 2, 1),                   # Shapes: 2 = triangle, 1 = circle
       pt.cex = c(1, 1, 1, 1),             # Adjust size for better visibility
       text.col = "black",                     # Text color
#       bty = "n"
       )  

## Now PC1 corresponds quite well to smoking status after the two ComBat corrections.

1.4 Differential expression analysis (limma with vooma)

# Finish setting up the design matrix
design <- model.matrix(~group + 0, gset)
colnames(design) <- levels(gs)


## Crucial bit: Replace the expression values in gset with the batch corrected ones ##
exprs(gset) <- as.matrix(exprs_matrix_combat_2)

# calculate precision weights and show plot of mean-variance trend
v <- vooma(gset, design, plot=T)

# OR weights by group
# v <- voomaByGroup(gset, group=groups, design, plot=T, cex=0.1, pch=".", col=1:nlevels(gs))
v$genes <- fData(gset) # attach gene annotations

# fit linear model
fit  <- lmFit(v)

# set up contrasts of interest and recalculate model coefficients
cts <- paste(groups[2], groups[1], sep="-")
cont.matrix <- makeContrasts(contrasts=cts, levels=design)
fit2 <- contrasts.fit(fit, cont.matrix)

# compute statistics and table of top significant genes
fit2 <- eBayes(fit2, 0.01)
tT <- topTable(fit2, adjust="fdr", sort.by="B", number=Inf)

tT <- subset(tT, select=c("ID","Gene.symbol","logFC","adj.P.Val"))

1.5 Basic filtering of DEGs (unlabelled, duplicates)

GSE63127_CS_NS_limma <- tT %>%
  filter(Gene.symbol != "") %>% # Remove blank gene symbols
  group_by(Gene.symbol) %>%
  slice_min(adj.P.Val, with_ties = TRUE) %>% 
  # For probesets mapping to same gene, keep one with lowest FDR. Keep ties for now to check later.
  ungroup()
head(GSE63127_CS_NS_limma)
## # A tibble: 6 × 4
##   ID          Gene.symbol  logFC adj.P.Val
##   <chr>       <chr>        <dbl>     <dbl>
## 1 229819_at   A1BG        -0.106    0.481 
## 2 232462_s_at A1BG-AS1     0.531    0.0224
## 3 220951_s_at A1CF         0.302    0.123 
## 4 1558450_at  A2M          0.110    0.453 
## 5 1564139_at  A2M-AS1     -0.138    0.0724
## 6 1553505_at  A2ML1        0.145    0.636
# Checking for ties
ties <- GSE63127_CS_NS_limma %>%
  group_by(Gene.symbol) %>%
  filter(n() > 1) %>%
  ungroup()
print(ties) # No ties
## # A tibble: 0 × 4
## # ℹ 4 variables: ID <chr>, Gene.symbol <chr>, logFC <dbl>, adj.P.Val <dbl>
nrow(GSE63127_CS_NS_limma)
## [1] 22189

1.8 Visualization of DEGs (volcano plot)

FDR_cutoff_A1 <- 0.001
log2FC_cutoff_A1 <- 0.5

v_A1 <- EnhancedVolcano::EnhancedVolcano(
  toptable = GSE63127_CS_NS_limma,
  lab = GSE63127_CS_NS_limma$Gene.symbol,
  x = "logFC", # "mean difference" is estimate here
  y = "adj.P.Val", 
 # pCutoffCol = 'min_smoothed_fdr',
  xlab = "log2FC",
  ylab = "-log10(FDR)",
  title = "A1 DEGs",
  subtitle = paste0("log2FC cutoff: ", log2FC_cutoff_A1, "    FDR cutoff: ", FDR_cutoff_A1),
  caption = paste0("Total = ", nrow(GSE63127_CS_NS_limma[abs(GSE63127_CS_NS_limma$logFC)>log2FC_cutoff_A1 &  GSE63127_CS_NS_limma$adj.P.Val < FDR_cutoff_A1,]), " significant DEGs meeting cutoffs"),
  col = c("grey30", "mediumpurple2", "royalblue", "orange2"),
  legendPosition = "bottom",
  labSize = 4,
  max.overlaps = 5,
  drawConnectors = TRUE,
  arrowheads = FALSE,
  pCutoff = FDR_cutoff_A1,
  FCcutoff = log2FC_cutoff_A1,
  gridlines.minor = FALSE,
  gridlines.major = FALSE,
  xlim = c(-3, 6)
)

v_A1
## Warning: ggrepel: 687 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

1.7 Save outputs

# Change date suffix as appropriate if changes are made
#write.table(GSE63127_CS_NS_limma, "../2_Outputs/1_Airway_DEGs/GSE63127_CS_NS_limma_20241204.txt", sep = '\t')

1.8 Extra checks

2024/11/07: Comparing the DEGs list when quantile normalization is used vs not used:

GSE63127_CS_NS_GEO2R_limma_sig_quantile <- read.table("../2_Outputs/GSE63127_CS_NS_GEO2R_limma_sig_20241016.txt", header = TRUE)
GSE63127_CS_NS_GEO2R_limma_sig_no_quantile <- GSE63127_CS_NS_limma[GSE63127_CS_NS_limma$adj.P.Val<0.05,]

# Compare results
library(VennDiagram)
## Loading required package: grid
## Loading required package: futile.logger
## 
## Attaching package: 'futile.logger'
## The following object is masked from 'package:mgcv':
## 
##     scat
venn <- venn.diagram(
  list(
    DEGs_no_quantile = GSE63127_CS_NS_GEO2R_limma_sig_no_quantile$Gene.symbol,
    DEGs_quantile = GSE63127_CS_NS_GEO2R_limma_sig_quantile$Gene.symbol
  ),
  filename = NULL
)
# Display the diagram
grid.newpage()
grid.draw(venn)

The lists agree quite well. The list without quantile normalization is larger. Quantile normalization could be over-normalization and mask some variation. The PCA plots still look good without quantile normalization. I will elect to go forward without quantile normalization. I will have to apply the same to all the other airway datasets for the meta-analysis bit (i.e. do not do quantile normalization for them).


2. Differential expression analysis of TCGA-LUAD tumor vs normal tissue (T-E)

2.1.1 Loading dataset

library(TCGAbiolinks)
library(SummarizedExperiment)
library(dplyr)

query <- GDCquery(project = "TCGA-LUAD",
                  data.category = "Transcriptome Profiling",
                  data.type = "Gene Expression Quantification",
                  sample.type = c("Primary Tumor", "Solid Tissue Normal"),
                  workflow.type = "STAR - Counts")

GDCdownload(query)

data <- GDCprepare(query)

2.1.2 Formatting as tumor-normal pairs

counts <- as.data.frame(assay(data))  # Extracting the count matrix (these are supposedly raw counts)
#head(counts)  # Viewing the first few rows (genes) and columns (samples)

gene_info <- as.data.frame(rowData(data))
head(gene_info)  # Preview the first few genes and their annotations
##                    source type score phase            gene_id      gene_type
## ENSG00000000003.15 HAVANA gene    NA    NA ENSG00000000003.15 protein_coding
## ENSG00000000005.6  HAVANA gene    NA    NA  ENSG00000000005.6 protein_coding
## ENSG00000000419.13 HAVANA gene    NA    NA ENSG00000000419.13 protein_coding
## ENSG00000000457.14 HAVANA gene    NA    NA ENSG00000000457.14 protein_coding
## ENSG00000000460.17 HAVANA gene    NA    NA ENSG00000000460.17 protein_coding
## ENSG00000000938.13 HAVANA gene    NA    NA ENSG00000000938.13 protein_coding
##                    gene_name level    hgnc_id          havana_gene
## ENSG00000000003.15    TSPAN6     2 HGNC:11858 OTTHUMG00000022002.2
## ENSG00000000005.6       TNMD     2 HGNC:17757 OTTHUMG00000022001.2
## ENSG00000000419.13      DPM1     2  HGNC:3005 OTTHUMG00000032742.2
## ENSG00000000457.14     SCYL3     2 HGNC:19285 OTTHUMG00000035941.6
## ENSG00000000460.17  C1orf112     2 HGNC:25565 OTTHUMG00000035821.9
## ENSG00000000938.13       FGR     2  HGNC:3697 OTTHUMG00000003516.3
sample_info <- as.data.frame(colData(data))
#head(sample_info)  # Preview sample metadata

table(sample_info$sample_type)  # Summarize sample types (Tumor vs. Normal)
## 
##       Primary Tumor Solid Tissue Normal 
##                 539                  59
# Extract just the normal sample info
sample_info_normal <- sample_info[sample_info$definition=="Solid Tissue Normal",]

# Look for tumor samples with normal matches from same patients
sample_info_tumor <- sample_info %>%
  filter(patient %in% sample_info_normal$patient) %>%
  filter(definition == "Primary solid Tumor")

# The tumor list is longer -- check out duplicate patient IDs in this list
sample_info_tumor_dups <- sample_info_tumor %>%
  group_by(patient) %>%
  filter(n() > 1) %>%
  ungroup()

unique(sample_info_tumor_dups$patient) # There are 6 patients with multiple tumor samples
## [1] "TCGA-44-6147" "TCGA-44-2662" "TCGA-44-5645" "TCGA-44-6146" "TCGA-44-2668"
## [6] "TCGA-44-2665"
sample_info_tumor_dups_FFPE <- sample_info_tumor_dups[sample_info_tumor_dups$is_ffpe,] # OK the difference is the FFPE status.
# It seems these are the only 6 patients in the group who have FFPE samples available.

# I guess I will make the decision to keep the 6 FFPE samples regardless. Not sure if that's the right choice but I'll do it for now.

# Get the non-FFPE duplicate patient sample info
sample_info_tumor_dups_non_FFPE <- sample_info_tumor_dups[!sample_info_tumor_dups$is_ffpe,]
# Remove these IDs from the main tumor sample info
sample_info_tumor <- sample_info_tumor %>% filter(! barcode %in% sample_info_tumor_dups_non_FFPE$barcode)

# There is 1 normal sample with no matching tumor sample it seems, so remove that
sample_info_normal <- sample_info_normal %>% filter(patient != "TCGA-44-6144")

# Make the matched tumor-normal sample table
sample_info_matched_T_NM <- rbind(sample_info_tumor, sample_info_normal)[order(c(seq_len(nrow(sample_info_tumor)), seq_len(nrow(sample_info_normal)))), ]
sample_info_matched_T_NM <- sample_info_matched_T_NM %>% 
  dplyr::select(-treatments) %>% # Removing treatments column since it is in the form of a list and has no info
  arrange(., sample_type_id) %>% # First sort by tumor vs normal
  arrange(., patient) # arrange by patient to get the tumor normal pairs

## Modifying the counts table for tumor-normal matched data ##

# Keep the counts columns of sample labels that are in the T-NM matched info
sample_barcodes <- as.character(sample_info_matched_T_NM$barcode)
counts_matched_T_NM <- counts %>%
  dplyr::select(all_of(sample_barcodes))

# Rename with sample label instead of sample barcode
names(counts_matched_T_NM) <- sample_info_matched_T_NM$sample

2.2.1 Quality control checks

library(dplyr)
library(edgeR)
## Warning: package 'edgeR' was built under R version 4.3.2
# Checking distribution of the whole counts table
hist(as.matrix(counts_matched_T_NM)) # whoa

hist(log2(as.matrix(counts_matched_T_NM))) # Still not normal at all

# Checking distribution of just tumor samples
# counts_matched_T <- counts_matched_T_NM %>%
#   dplyr::select(seq(1, ncol(counts_matched_T_NM), by = 2))
# hist(log2(as.matrix(counts_matched_T))) # Equally bad distribution, why is it the same though??
# 
# # Checking distribution of just normal samples
# counts_matched_NM <- counts_matched_T_NM %>%
#   dplyr::select(seq(2, ncol(counts_matched_T_NM), by = 2))
# hist(log2(as.matrix(counts_matched_NM))) # Equally bad distribution, why is it the same though????


boxplot(counts_matched_T_NM) # Boxplots for all counts looks crazy, but apparently this is common in RNAseq data and is accounted for by DESeq2 by a size faxctor approach (can back up later with a source)

## PCA to check for tumor-normal separation
colz <- as.numeric(as.factor(rep(c(1,0), length(counts_matched_T_NM)/2))) # Get color values from group
plotMDS(counts_matched_T_NM,
        gene.selection = "common",
        main = "PCA for TCGA-LUAD expression",
        col = colz,
        pch = 1
)
legend("bottomleft", 
       legend = c("Tumor", "Normal"), 
       col = c("red", "black"), # Colors: only for smoking status
       pch = c(15, 15),                
       text.col = "black"
       )  

# Separate but not very good separation, 1 definite outlier.
# To find the outlier, plotting PCA with sample names
plotMDS(counts_matched_T_NM,
        gene.selection = "common",
        main = "PCA for TCGA-LUAD expression",
        col = colz
        #pch = 1
)
legend("bottomleft", 
       legend = c("Tumor", "Normal"), 
       col = c("red", "black"), # Colors: only for smoking status
       pch = c(15, 15),                
       text.col = "black"
       )  

# Checking out this outlier, TCGA-38-4626-01A
hist(log2(counts_matched_T_NM$`TCGA-38-4626-01A`)) # Not obvious why it's an outlier, but must somehow be really normal-like?

##  Making a dendrogram to see if the same outlier is found
sample_dist <- dist(t(counts_matched_T_NM))  # Transpose the matrix to calculate distances between samples
hc <- hclust(sample_dist) #Perform hierarchical clustering
plot(hc, main = "Dendrogram of Samples", xlab = "", sub = "", cex = 0.8) # Plot the dendrogram

2.2.2 Acting on quality control checks: Remove outlier and its pair

# Remove the 1 most obvious outlier and its pair:

# TCGA-38-4626-01A, TCGA-38-4626-11A

counts_matched_T_NM <- counts_matched_T_NM %>% dplyr::select(-c("TCGA-38-4626-01A","TCGA-38-4626-11A"))

# counts_matched_T_NM <- counts_matched_T_NM %>% dplyr::select(-c("TCGA.38.4626.01A","TCGA.38.4626.11A"))
# Version after reading it in

## PCA to check for tumor-normal separation with outlier removed
colz2 <- as.numeric(as.factor(rep(c(0,1), length(counts_matched_T_NM)/2))) # Get color values from group
plotMDS(counts_matched_T_NM,
        gene.selection = "common",
        main = "PCA for TCGA-LUAD expression after outlier removal",
        col = colz2,
        pch = 1
)
legend("bottomleft", 
       legend = c("Tumor", "Normal"), 
       col = c("red", "black"), # Colors: only for smoking status
       pch = c(15, 15),                
       text.col = "black"
       )  

## Saving this version of the T-NM matched counts
#write.table(counts_matched_T_NM, "../2_Outputs/3_Tumor_expression/TCGA_LUAD_counts_matched_T_NM_20241125.txt")

The matrices have messy boxplots and histograms, but since I am using the signed-rank test, it does not suppose require normally distributed data, so I have decided to go with this raw counts matrix for now.

2024/12/04 In an earlier version of the script I used a Wilcoxon signed-rank test since I read in a paper that a Wilcoxon-based method was the most accurate method (Li et al. Genome Biology (2022) 23:79)

I wrote some code based on below: Source code: https://github.com/xihuimeijing/DEGs_Analysis_FDR/blob/main/scripts/DEGs.R Accessed 2023/08/26

Tutorial: https://rpubs.com/LiYumei/806213 Accessed 2023/08/31

Unlike the tutorial, here I performed a signed-rank test rather than a rank-sum test, as the samples are not independent (they are matched tumor and normal samples).

However, I later decided to switch to DESeq2 since this is the preferred, standard method for large sample sizes in RNAseq (find a good source to back it up).

It seems that the Wilcoxon signed-rank test may be much more suitable for assessing a handful of genes rather than whole-transcriptome analysis. DESeq2 is more typically used for the latter, despite the finding of the publication listed above. I will compare the results using DESeq2.

Below I show the DESeq2 version.

Note that I include a filtering step to remove genes with low counts: “Filter out rows with less than 10 total counts in the smallest sample group size (114/2 = 57)” The smallest group (tumor or normal) has 57 members, so this filters out any genes that cannot reach a sum of 10 counts using 57 samples.

2.3 Differential expression analysis using DESeq2, with filter for low counts

library(DESeq2)
## Warning: package 'DESeq2' was built under R version 4.3.3
library(apeglm)

readCount <- as.matrix(counts_matched_T_NM)

# Removing the outlier samples from the sample info and setting rownames as sample names
sample_info_matched_T_NM <- sample_info_matched_T_NM %>% 
  dplyr::filter(., sample != c("TCGA-38-4626-01A","TCGA-38-4626-11A"))
rownames(sample_info_matched_T_NM) <- sample_info_matched_T_NM$sample

# Checking the sample names are in the same order
all(colnames(readCount)==rownames(sample_info_matched_T_NM))
## [1] TRUE
# Preparing and performing DESeq
dds <- DESeqDataSetFromMatrix(countData = readCount,
                              colData = sample_info_matched_T_NM,
                              design= ~ definition)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
# Filter out rows with less than 10 total counts in the smallest sample group size (114/2 = 57)
keep <- rowSums(counts(dds) >= 10) >= 57
dds <- dds[keep,]

# Set the reference level as the normal tissue - 
dds$definition <- relevel(dds$definition, ref = "Solid Tissue Normal")

# Perform differential expression analysis
dds <- DESeq(dds)
## estimating size factors
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
## fitting model and testing
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## -- replacing outliers and refitting for 754 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
resultsNames(dds) # lists the coefficients
## [1] "Intercept"                                            
## [2] "definition_Primary.solid.Tumor_vs_Solid.Tissue.Normal"
# res <- results(dds, 
#                name="definition_Primary.solid.Tumor_vs_Solid.Tissue.Normal"
#                )

# or to shrink log fold changes association with condition:
res <- lfcShrink(dds, 
                coef="definition_Primary.solid.Tumor_vs_Solid.Tissue.Normal", 
                type="apeglm")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
summary(res)
## 
## out of 19062 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 9099, 48%
## LFC < 0 (down)     : 5225, 27%
## outliers [1]       : 0, 0%
## low counts [2]     : 0, 0%
## (mean count < 10)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
res_table <- as.data.frame(cbind( log2FC = res$log2FoldChange, FDR = res$padj, base_mean = res$baseMean))

### Replace ensembl IDs with gene names
res_table <- res_table %>% arrange(., rownames(.))
gene_info_sorted <- gene_info %>% 
  arrange(., gene_id) %>%
  filter(gene_id %in% rownames(res_table))

res_table$gene <- gene_info_sorted$gene_name

# Rename results table to be more descriptive
TE_T_NM_DEseq2_DEGs <- res_table

2.4 Visualization of DEGs from DESEq2 calculation (volcano plot)

log2FC_cutoff_TE <- 1.5
FDR_cutoff_TE <- 0.001

v_TE <- EnhancedVolcano::EnhancedVolcano(
  toptable = TE_T_NM_DEseq2_DEGs,
  lab = TE_T_NM_DEseq2_DEGs$gene,
  x = "log2FC",
  y = "FDR", 
 # pCutoffCol = 'min_smoothed_fdr',
  xlab = "log2FC",
  ylab = "-log10(FDR)",
  title = "TE DEGs",
  subtitle = paste0("log2FC cutoff: ", log2FC_cutoff_TE, "   FDR cutoff: ", FDR_cutoff_TE),
  caption = paste0("Total = ", nrow(TE_T_NM_DEseq2_DEGs[abs(TE_T_NM_DEseq2_DEGs$log2FC)>log2FC_cutoff_TE & TE_T_NM_DEseq2_DEGs$FDR < FDR_cutoff_TE,]), " significant DEGs meeting cutoffs"),
  col = c("grey30", "mediumpurple2", "royalblue", "orange2"),
  legendPosition = "bottom",
  labSize = 3,
  max.overlaps = 10,
  drawConnectors = TRUE,
  arrowheads = FALSE,
  pCutoff = FDR_cutoff_TE,
  FCcutoff = log2FC_cutoff_TE,
  gridlines.minor = FALSE,
  gridlines.major = FALSE,
  #xlim = c(-3, 6)
  ylim = c(0,10)
)

v_TE
## Warning: ggrepel: 395 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## 2.5 Checks: Comparing genes output from DESeq2 calculation with genes output from Signed-rank test calculation

library(ggvenn)
## Loading required package: scales
DGE_LUAD_T_NM_signed_rank_sig <- read.table("../2_Outputs/4_Tumor_DEGs/DGE_LUAD_T_NM_signed_rank_sig_20241107.txt")

# Define the gene lists
DESeq2_list <- TE_T_NM_DEseq2_DEGs[TE_T_NM_DEseq2_DEGs$FDR<0.05,]$gene
signed_rank_list <- DGE_LUAD_T_NM_signed_rank_sig$Gene

library(ggvenn)

# Define the gene lists in a named list
gene_lists <- list(
  "DESeq2_list" = DESeq2_list,
  "signed_rank_list" = signed_rank_list
)

# Create the Venn diagram
ggvenn::ggvenn(gene_lists, fill_color = c("blue", "red"))

## ~80% agreement between the lists is really good. Since there is better justification for using DESeq2, I should probably use that one.

Saving outputs

# Change date suffix as appropriate if modifications are made
#write.table(TE_T_NM_DEseq2_DEGs, "../2_Outputs/4_Tumor_DEGs/TE_T_NM_DEseq2_DEGs_20241204.txt", sep = '\t')

3. Differential methylation analysis of TCGA-LUAD tumor vs normal tissue (T-M)

I downloaded this level 3 methylation 450k data from cBioPortal, from TCGA Lung Adenocarcinoma (Firehose Legacy) https://www.cbioportal.org/study/summary?id=luad_tcga (Accessed 2023/08/29)

Note that this provides gene information but not probe information. According to the metadata, “Methylation (HM450) beta-values for genes in 492 cases. For genes with multiple methylation probes, the probe most anti-correlated with expression.”

2024/12/04:

I did a lot of work trying to do the analysis starting from probe level information, but ultimately decided to stick with the cbioportal pre-processed table for now. I may want to revisit the probewise analysis later once I am more confident in the output.

I decided I should switch from wilcoxon sign-rank test to limma because this is more typiucally used for large and complex datasets like TCGA with differential methylation analysis (look for source). Similar justification to DESeq2 for expression.

3.1 Loading dataset downloaded from cbioportal

data_methylation_hm450_tumor <- read.table("../../Former_Smokers_Aim_2/1_TCGA_LUAD_multiomics/0_Unpaired_input_tables/data_methylation_hm450.txt", header=TRUE, fill=TRUE)

data_methylation_hm450_normal <- read.table("../../Former_Smokers_Aim_2/1_TCGA_LUAD_multiomics/0_Unpaired_input_tables/data_methylation_hm450_normals.txt", header=TRUE, fill=TRUE)

3.2 Formatting dataset

3.2.1 Formatting counts in tumor-normal pairs

allIDs_tumor  <- colnames(data_methylation_hm450_tumor)
allIDs_normal <- colnames(data_methylation_hm450_normal)

#Listing IDs of tumors that have matched normals by changing the tissue ID to the "tumor" identifier, "01", for matching purposes.
IDs_tumor_with_matches <-gsub(".11",".01", allIDs_normal)

#Make a table of the methylation data for tumor samples only with matching normal data.
#
data_methylation_hm450_tumor_with_matches <- data_methylation_hm450_tumor %>%
  dplyr::select(any_of(IDs_tumor_with_matches))

#Make a table of the methylation data for normal samples only with matching tumor data.
# Note that 3 of the normal samples don't have a matching tumor sample:
#`TCGA.44.2655.01`, `TCGA.44.2659.01`, and `TCGA.44.2662.01` don't exist.
data_methylation_hm450_normal_with_matches <- data_methylation_hm450_normal %>%
  dplyr::select(-c('TCGA.44.2655.11', 'TCGA.44.2659.11','TCGA.44.2662.11'))

#Make a combined table of matched tumor-normal samples.
data_methylation_hm450_tumor_normal_matched <- cbind(data_methylation_hm450_tumor_with_matches, data_methylation_hm450_normal_with_matches)[order(c(1:31,1:31))]

#Remove duplicate gene ID column and the entrez ID columns
data_methylation_hm450_tumor_normal_matched <- data_methylation_hm450_tumor_normal_matched[,-c(1,3,4)]

3.2.2 Giving suffixes to duplicate genes

# I want to make the gene names into row names, but I cannot because some gene names appear twice.
# So, I will rename them with indexes _1 and _2 and figure out  why they appeared twice later.

#Checking rows of the gene names with duplicates:
checking_dups <- data_methylation_hm450_tumor_normal_matched[data_methylation_hm450_tumor_normal_matched$Hugo_Symbol.1 %in% c("AGER", "CX3CR1", "F2R", "GADL1", "GCOM1", "KLK10", "PALM2AKAP2", "QSOX1", "RCC1"),]

# I see that these are not identical rows - the methylation values are different. So, I will go ahead and add indexes.
checking_dups <- checking_dups[order(checking_dups$Hugo_Symbol.1),]#Sort by gene name

checking_dups <- cbind(rownames(checking_dups), checking_dups[,1]) #Make table of just the gene names and row names of the original file

checking_dups[,2] <- paste(checking_dups[,2],1:2,sep="_") # Add a suffix to the gene names

#Replace the gene names in the T-NM matched file with the suffixed gene names
data_methylation_hm450_tumor_normal_matched[checking_dups[,1],1] <- checking_dups[,2]

#Now that there are no longer duplicates, make the gene names column into the row names and remove the gene names column.
rownames(data_methylation_hm450_tumor_normal_matched) <- data_methylation_hm450_tumor_normal_matched[,1]
data_methylation_hm450_tumor_normal_matched <- data_methylation_hm450_tumor_normal_matched[,2:59]

3.3 Quality control checks and conversion to M values for differential analysis

hist(as.matrix(data_methylation_hm450_tumor_normal_matched))

boxplot(data_methylation_hm450_tumor_normal_matched)

max(!is.na(data_methylation_hm450_tumor[3:length(data_methylation_hm450_tumor_normal_matched)]))
## [1] 1
min(!is.na(data_methylation_hm450_tumor[3:length(data_methylation_hm450_tumor_normal_matched)]))
## [1] 0
# Beta values normally have a bimodal distribution so it's not really unusual to see this

## Conversion from beta to M values ##

# Shorter name for convenience
methyl_beta <- data_methylation_hm450_tumor_normal_matched
# Convert to M values
methyl_M=log2(data_methylation_hm450_tumor_normal_matched/(1-data_methylation_hm450_tumor_normal_matched))
hist(as.matrix(methyl_M), breaks = 100) ## Closer to normal distribution though still definitely not a perfect one

### PCA checks ###

## PCA to check for tumor-normal separation with outlier removed
colz <- rep(c("red","black"), length(methyl_M)/2) # Get color values from T or NM group
plotMDS(methyl_M,
        gene.selection = "common",
        main = "PCA for TCGA-LUAD methylation T-NM matched",
        col = colz,
        pch = 1
)

legend("bottomleft", 
       legend = c("Tumor", "Normal"), 
       col = c("red", "black"),
       pch = 15
       )  

## Very good tumor-normal separation

3.4 Differential expression analysis using limma

# Create a design matrix
groups <- rep(c("Tumor","Normal"), length(methyl_M)/2)
design <- model.matrix(~0 + groups)

# "Condition" should indicate Tumor/Normal status
colnames(design) <- c("Normal", "Tumor")

# Fit the model
fit <- lmFit(methyl_M, design)

# Define contrasts (Tumor vs Normal)
contrast_matrix <- makeContrasts(Tumor - Normal, levels = design) 
fit2 <- contrasts.fit(fit, contrast_matrix)
fit2 <- eBayes(fit2)

# Get top differentially methylated probes
tT <- topTable(fit2, adjust = "BH", number = Inf)

# Giving more descriptive name and colnames to list
TCGA_LUAD_DMG <- tT
TCGA_LUAD_DMG$Gene <- rownames(TCGA_LUAD_DMG)
rownames(TCGA_LUAD_DMG) <- NULL
TCGA_LUAD_DMG <- TCGA_LUAD_DMG %>%
  dplyr::select(., Gene, logFC, adj.P.Val) %>%
  dplyr::rename(., log2FC = logFC, FDR = adj.P.Val)

# Giving a yet more descriptive name
TCGA_LUAD_limma_DMG <- TCGA_LUAD_DMG

nrow(TCGA_LUAD_limma_DMG)
## [1] 16565
### Saving output
#write.table(TCGA_LUAD_limma_DMG, "../2_Outputs/5_Tumor_DMGs/TCGA_LUAD_limma_DMG_20241203.txt", sep = '\t')

3.5 Extra check of the wilcoxon signed-rank output compared with the limma output for T-M

library(ggvenn)

DMeth_LUAD_T_NM_hm450_sig <- read.table("../2_Outputs/5_Tumor_DMGs/DMeth_LUAD_T_NM_hm450_sig_20241108_PM.txt")

# Define the gene lists
limma_list <- TCGA_LUAD_DMG[TCGA_LUAD_DMG$FDR<0.05,]$Gene # Since other list has 0.05 cutoff, chose that
signed_rank_list <- DMeth_LUAD_T_NM_hm450_sig$Gene

library(ggvenn)

# Define the gene lists in a named list
gene_lists <- list(
  "limma_list" = limma_list,
  "signed_rank_list" = signed_rank_list
)

# Create the Venn diagram
ggvenn::ggvenn(gene_lists, fill_color = c("blue", "red"))

# 75% agreement seems pretty good

comparing_DMG_limma_vs_signed_rank <- TCGA_LUAD_DMG[TCGA_LUAD_DMG$FDR<0.05,] %>%
  dplyr::inner_join(., DMeth_LUAD_T_NM_hm450_sig, by = "Gene", suffix = c("_limma", "_signed_rank"))

## Looking back at the wilcoxon sign rank script, I see I actually used beta values, which might explain the differences in log2FC values...but there is agreement in sign and some agreement in magnitude at least.

4. Differential expression analysis of reference reference “persistent” airway current vs former vs never smoker dataset (A2)

4.1 Loading dataset

# load series and platform data from GEO

gset <- getGEO("GSE7895", GSEMatrix =TRUE, AnnotGPL=TRUE)
## Found 1 file(s)
## GSE7895_series_matrix.txt.gz
if (length(gset) > 1) idx <- grep("GPL96", attr(gset, "names")) else idx <- 1
gset <- gset[[idx]]

# make proper column names to match toptable 
fvarLabels(gset) <- make.names(fvarLabels(gset))

# group membership for all samples
gsms <- paste0("22222222222222222222200000000000000000000000000000",
               "00000000000000000000000111111111111111111111111111",
               "1111")
sml <- strsplit(gsms, split="")[[1]]

gset <- gset[complete.cases(exprs(gset)), ] # skip missing values

# assign samples to groups and set up design matrix
gs <- factor(sml)
groups <- make.names(c("current_smoker","former_smoker","never_smoker"))
levels(gs) <- groups
gset$group <- gs
design <- model.matrix(~group + 0, gset)
colnames(design) <- levels(gs)

gset <- gset[complete.cases(exprs(gset)), ] # skip missing values

4.2 Quality control checks and normalization

2024/12/04: My real reason for using quantile normalization is that I obtained more DEGs when I did it this way. But to give a better justification I could point out that the boxplots look quite variable and it seemed appropriate to normalize the expression distributions.

4.2.1 Initial checks (histogram, boxplot, PCA), and quantile normalization

## Make histograms and boxplots to check if the data is log-transformed and needs quantile normalization ##

hist(as.matrix(exprs(gset))) # Values range 1-15, and 1 big peak around 3. 

boxplot(exprs(gset)) # Same range of values, with similar-looking ranges, but not exactly the same

# Narrow range, therefore no log2 normalization needed

exprs(gset) <- normalizeBetweenArrays(exprs(gset))
boxplot(exprs(gset))

# 2024/11/12: I elected to do quantile normalization because this gave me a larger list of "persistent" genes. Could justify that it "better captures the variation between groups" etc

min(exprs(gset))
## [1] -0.2140344
max(exprs(gset))
## [1] 14.59784
## Plot PCA ##
colz <- as.numeric(as.factor(gs)) # Get color values from group
plotMDS(exprs(gset),
        gene.selection = "common",
        main = "PCA for GSE7895",
        col = colz,
        pch = 1
        #labels = gs
        )

legend("topright", legend = levels(as.factor(gs)), 
       fill = unique(colz), 
       title = "Smoking status")

# No separation, all mixed up. This isn't a good look.

4.2.2 Investigating sources of variation that could account for lack of separation of samples by smoking status

Extract and format phenotypic data

library(stringr)

phenotypic_data <- pData(gset)  # Extract phenotypic data

# List of column names I want to keep and clean up into usable labels
columns_to_find <- c("characteristics_ch1.1", "group")

# Get the column indexes
indexes <- sapply(columns_to_find, function(col_name) which(names(phenotypic_data) == col_name))
indexes <- unlist(indexes)

phenotypic_data <- phenotypic_data[,c(indexes)]

# Extract Age
phenotypic_data$age <- as.numeric(str_extract(phenotypic_data$characteristics_ch1.1, "(?<=Age:)\\d+"))

# Extract Packyears
phenotypic_data$packyears <- as.numeric(str_extract(phenotypic_data$characteristics_ch1.1, "(?<=Packyears:)\\d+"))

# Extract Time Since Quit Smoking (months)
phenotypic_data$TSQ_months <- as.numeric(str_extract(phenotypic_data$characteristics_ch1.1, "(?<=Time Since Quit Smoking \\(months\\):)\\d+"))

# Delete the original column with the unseparated info
phenotypic_data <- phenotypic_data[,-1]

# Convert the NA values for packyears for never smokers to zero (this makes sense since the never smokers have 0 pack years)
phenotypic_data$packyears[phenotypic_data$group=="never_smoker"] <- 0

# Convert the NA values for TSQ_months to zero for current smokers (again makes sense)
phenotypic_data$TSQ_months[phenotypic_data$group=="current_smoker"] <- 0

# Make column to denote just former smoking status for the linear model
phenotypic_data$former_smoking_status <- as.factor(as.numeric(phenotypic_data$group == "former_smoker"))

# Make column to denote just current smoking status for the linear model
phenotypic_data$current_smoking_status <- as.factor(as.numeric(phenotypic_data$group == "current_smoker"))

# Make column to denote just never smoking status for the linear model
phenotypic_data$never_smoking_status <- as.factor(as.numeric(phenotypic_data$group == "never_smoker"))

4.2.3 Plot PCA using other phenotypic data

## Plot PCA using age to define color
# Create a gradient color palette (light blue to dark blue)
palette <- colorRampPalette(c("lightblue", "darkblue"))

## Plot PCA of age ##
colz_age <- palette(length(phenotypic_data$age))[rank(phenotypic_data$age)]  # Map ages to gradient colors
plotMDS(exprs(gset),
        gene.selection = "common",
        main = "PCA for GSE7895 (darker blue ~ higher age)",
        col = colz_age,
        pch = 1
        )
# Add a color bar for age
legend("topright", legend = range(phenotypic_data$age), 
       fill = palette(2), 
       title = "Age")

# Does not seem to be an age effect




### Plot PCA of packyears ###

# Excluding packyears of zero (never smokers)
pheno_packyears <- phenotypic_data[phenotypic_data$packyears!=0,]
exprs_packyears <- as.data.frame(exprs(gset)) %>%
  dplyr::select(rownames(pheno_packyears))

colz_packyears <-  palette(length(pheno_packyears$packyears))[rank(pheno_packyears$packyears)] # Map packyears to gradient colors
plotMDS(exprs_packyears,
        gene.selection = "common",
        main = "PCA for GSE7895 (darker blue ~ higher packyears)",
        col = colz_packyears,
        pch = 1
        #labels = gs
        )
# Add a color bar for packyears
legend("topright", legend = range(pheno_packyears$packyears), 
       fill = palette(2), 
       title = "Packyears")

## Does not seem to be packyears effect


### Plot PCA of time since quitting ###
pheno_tsq <- phenotypic_data[!is.na(phenotypic_data$TSQ_months),]
exprs_tsq <- as.data.frame(exprs(gset)) %>%
  dplyr::select(rownames(pheno_tsq))

colz_TSQ <- palette(length(pheno_tsq$TSQ))[rank(pheno_tsq$TSQ)]  # Map packyears to gradient colors
plotMDS(exprs_tsq,
        gene.selection = "common",
        main = "PCA for GSE7895 (darker blue ~ more time since quitting)",
        col = colz_TSQ,
        pch = 1
        #labels = gs
        )
legend("topright", legend = range(pheno_tsq$TSQ), 
       fill = palette(2), 
       title = "TSQ")

## Does not seem to be TSQ effect

This lack of separation by smoking status and inability to account for the variation by other phenotypic variables could potentially be problematic, but I propose/hypothesize that if the genes determined to be “persistent” can differentiate between the groups as expected in PCA, it will be evidence that the results are valid despite the groups not being differentiated by all the genes taken as a whole.

Note that I began trying to do this analysis accounting for pack years and TSQ (see other script), but for now I am just looking at the smoking status comparisons alone.

4.3 Differential expression analysis

v <- vooma(gset, design, plot=T)

v$genes <- fData(gset) # attach gene annotations


# fit linear model
fit  <- lmFit(v)

# set up contrasts of interest and recalculate model coefficients
#cts <- c(paste(groups[1],"-",groups[2],sep=""), paste(groups[1],"-",groups[3],sep=""), paste(groups[2],"-",groups[3],sep=""))
#cont.matrix <- makeContrasts(contrasts=cts, levels=design)
cont.matrix <- makeContrasts(
  CS_vs_NS = current_smoker - never_smoker,
  FS_vs_NS = former_smoker - never_smoker,
  CS_vs_FS = current_smoker - former_smoker,
  levels = design
)

fit2 <- contrasts.fit(fit, cont.matrix)


# compute statistics and table of top significant genes
fit2 <- eBayes(fit2, proportion = 0.01) # Proportion is "assumed proportion of genes which are differentially expressed"

4.4 Select “persistent” DEGs, and basic filter (keep lower FDR of duplicates, apply FDR < 0.05 cutoff)

library(dplyr)
library(VennDiagram)

## Separate out genes that are DEGS in CS vs NS and FS vs NS

## Note: I have decided not to filter out genes that are significantly different between CS and FS because I realized that doesn't make logical sense.

# summarize test results as "up", "down" or "not expressed"
dT <- decideTests(fit2, adjust.method="fdr", 
                  p.value=0.05, 
                  lfc=0)
# Note that this already has a p-value cutoff of 0.05 unlike my other datasets, but this is ok because this is the least stringent possible cutoff we will use anyway

# Venn diagram of results
vennDiagram(dT)

# Select the genes differentially expressed in both CS_vs_NS and FS_vs_NS
dT_persistent <- dT %>%
  as.data.frame(.) %>%
  filter(CS_vs_NS != 0) %>% # Differentially expressed in CS vs NS
  filter(CS_vs_NS == FS_vs_NS)# Differentially expressed, same sign in FS vs NS
nrow(dT_persistent) # 128 genes indeed
## [1] 128
# Get the toptable format for all genes
tT <- topTable(fit2, adjust="fdr", sort.by="B", number=Inf) # Inf shows all the significant genes

# Filter to the "persistent" genes
tT_persistent <- tT %>%
  filter(ID %in% rownames(dT_persistent))

# Filter out blanks, keep lower FDR of ties
tT_persistent <- tT_persistent %>%
  filter(Gene.symbol != "") %>% # Remove blank gene symbols
  group_by(Gene.symbol) %>%
  slice_min(adj.P.Val, with_ties = TRUE) %>% 
  # For probesets mapping to same gene, keep one with lowest FDR. Keep ties for now to check later.
  ungroup()
nrow(tT_persistent)
## [1] 116
# Checking for ties
ties <- tT_persistent%>%
  group_by(Gene.symbol) %>%
  filter(n() > 1) %>%
  ungroup()
print(ties)
## # A tibble: 2 × 28
##   ID      Gene.title Gene.symbol Gene.ID UniGene.title UniGene.symbol UniGene.ID
##   <chr>   <chr>      <chr>       <chr>   <chr>         <chr>          <chr>     
## 1 214303… mucin 5AC… MUC5AC      4586    ""            ""             ""        
## 2 214385… mucin 5AC… MUC5AC      4586    ""            ""             ""        
## # ℹ 21 more variables: Nucleotide.Title <chr>, GI <int>,
## #   GenBank.Accession <chr>, Platform_CLONEID <lgl>, Platform_ORF <lgl>,
## #   Platform_SPOTID <chr>, Chromosome.location <chr>,
## #   Chromosome.annotation <chr>, GO.Function <chr>, GO.Process <chr>,
## #   GO.Component <chr>, GO.Function.ID <chr>, GO.Process.ID <chr>,
## #   GO.Component.ID <chr>, CS_vs_NS <dbl>, FS_vs_NS <dbl>, CS_vs_FS <dbl>,
## #   AveExpr <dbl>, F <dbl>, P.Value <dbl>, adj.P.Val <dbl>
# As there is a tie with MUCA5 I will remove the MUCA5 probe with an "x" label for cross-reactivity
tT_persistent <- tT_persistent %>% filter (ID != "214303_x_at")

#Pick the columns we care about
GSE7895_persistent_DEGs <- tT_persistent %>%
  dplyr::select(., Gene.symbol, CS_vs_NS, FS_vs_NS, CS_vs_FS, adj.P.Val) %>%
  dplyr::rename(., Gene = Gene.symbol, CS_NS_A2 = CS_vs_NS, FS_NS_A2 = FS_vs_NS, CS_FS_A2 = CS_vs_FS, FDR_A2 = adj.P.Val)

# Save output
#write.table(GSE7895_persistent_DEGs, "../2_Outputs/1_Airway_DEGs/GSE7895_persistent_DEGs_20241204.txt")

4.5 Extra check (PCA for stratification of samples based on persistent genes)

## Filter exprs to the "persistent" genes
exprs_persistent <- as.data.frame(exprs(gset)) %>%
  filter(rownames(.) %in% tT_persistent$ID)

## Plot PCA ##
colz<- as.numeric(as.factor(gs)) # Get color values from group
plotMDS(exprs_persistent,
        gene.selection = "common",
        main = "PCA for GSE7895 with persistent genes",
        col = colz,
        pch = 1
        #labels = gs
        )

legend("topright", legend = levels(as.factor(gs)), 
       fill = unique(colz), 
       title = "Smoking status")

# You can see more separation happening, but I would expect to see current and former smokers more mixed together, whereas we see former and never smokers more mixed together. Hmm okay, interesting at least.

# Might be good to check on the age, packyears and TSQ here as well?


## Plot PCA of age ##
colz_age <- palette(length(phenotypic_data$age))[rank(phenotypic_data$age)]  # Map ages to gradient colors
plotMDS(exprs_persistent,
        gene.selection = "common",
        main = "PCA for GSE7895 (darker blue ~ higher age)",
        col = colz_age,
        pch = 16
        )
# Add a color bar for age
legend("topright", legend = range(phenotypic_data$age), 
       fill = palette(2), 
       title = "Age")

# Does not seem to be an age effect

### Plot PCA of packyears ###

# Excluding packyears of zero (never smokers)
exprs_persistent_packyears <- as.data.frame(exprs_persistent) %>%
  dplyr::select(rownames(pheno_packyears))

colz_packyears <-  palette(length(pheno_packyears$packyears))[rank(pheno_packyears$packyears)] # Map packyears to gradient colors
plotMDS(exprs_persistent_packyears,
        gene.selection = "common",
        main = "PCA for GSE7895 persistent genes (darker blue ~ higher packyears)",
        col = colz_packyears,
        pch = 16
        #labels = gs
        )
# Add a color bar for packyears
legend("bottomleft", legend = range(pheno_packyears$packyears), 
       fill = palette(2), 
       title = "Packyears")

## Maybe some sort of packyears effect happening, not obviously so


### Plot PCA of time since quitting ###
exprs_persistent_tsq <- as.data.frame(exprs_persistent) %>%
  dplyr::select(rownames(pheno_tsq))

colz_TSQ <- palette(length(pheno_tsq$TSQ))[rank(pheno_tsq$TSQ)]  # Map packyears to gradient colors
plotMDS(exprs_persistent_tsq ,
        gene.selection = "common",
        main = "PCA for GSE7895 persistent genes (darker blue ~ more months since quitting)",
        col = colz_TSQ,
        pch = 16
        #labels = gs
        )
legend("bottomleft", legend = range(pheno_tsq$TSQ), 
       fill = palette(2), 
       title = "TSQ")

## Maybe some TSQ effect but not super obvious